./../SFARI_genes_01-15-2019.csv of SFARI genes information downloaded from https://gene.sfari.org/database/gene-scoring/. Downloaded version 01-15-19.
SFARI_scraping/genes/gene_name.csv file for each of the SFARI genes with the information of all the reports related to it obtained using the SFARI_scraper.py script and scrapy package.
MeSH_terms_by_paper.json file with all the MeSH terms associated to each of the articles from the SFARI_scraper output files. Retrieved from the NCBI e-utils api using the get_articles_metadata.R script and reutils package. Downloaded on 24/05/19.
List of qualifiers/subheadings by paper extracted from the MeSH terms
### GET LIST OF GENES
list_gene_files = list.files('./../SFARI_scraping/genes/')
# GET LIST OF PUBMED IDS
get_pubmed_IDs = function(){
all_pubmed_ids = c()
for(gene_file in list_gene_files){
file = read.delim(paste0('./../SFARI_scraping/genes/', gene_file))
pubmed_ids = unique(file$pubmed.ID)
all_pubmed_ids = unique(c(all_pubmed_ids, pubmed_ids))
}
all_pubmed_ids = as.character(all_pubmed_ids)
return(all_pubmed_ids)
}
all_pubmed_ids = get_pubmed_IDs()
# CREATE GENE-ARTICLE (SPARSE) MATRIX
SFARI_genes_info = read.csv('./../../Data/SFARI_genes_01-15-2019.csv') %>% mutate('ID'=gene.symbol)
create_gene_pmID_mat = function(){
# Create empty dataframe
gene_pmID_mat = data.frame(matrix(0, nrow=length(list_gene_files), ncol=length(all_pubmed_ids)))
rownames(gene_pmID_mat) = sapply(list_gene_files, function(x) gsub('.csv', '', x))
colnames(gene_pmID_mat) = all_pubmed_ids
# Fillout dataframe
for(gene_file in list_gene_files){
file = read.delim(paste0('./../SFARI_scraping/genes/', gene_file))
pubmed_ids = as.character(unique(file$pubmed.ID))
gene_pmID_mat[gsub('.csv', '', gene_file), pubmed_ids] = 1
}
sparse_gene_pmID_mat = Matrix(as.matrix(gene_pmID_mat), sparse=TRUE)
return(sparse_gene_pmID_mat)
}
gene_pmID_mat = create_gene_pmID_mat()
print(paste0('Genes: ', nrow(gene_pmID_mat), ' Articles: ', ncol(gene_pmID_mat)))
## [1] "Genes: 1053 Articles: 3709"
rm(list_gene_files)
# LOAD AND PREPROCESS MESH TERMS
get_qualifiers = function(mesh_terms_by_article_list){
qualifiers = c('abnormalities', 'administration & dosage', 'adverse effects', 'analogs & derivatives', 'analysis',
'anatomy & histology', 'antagonists & inhibitors', 'biosynthesis', 'blood supply', 'cerebrospinal fluid',
'chemical synthesis', 'chemically induced', 'classification', 'complications', 'congenital',
'cytology', 'deficiency', 'diagnosis', 'diagnostic imaging', 'diet therapy', 'drug effects', 'drug therapy',
'economics', 'education', 'embryology', 'enzymology', 'epidemiology', 'ethics', 'ethnology', 'etiology',
'growth & development', 'history', 'immunology', 'injuries', 'innervation', 'instrumentation',
'isolation & purification', 'legislation & jurisprudence', 'manpower', 'metabolism', 'methods', 'microbiology',
'mortality', 'nursing', 'organization & administration', 'parasitology', 'pathology', 'pharmacokinetics',
'pharmacology', 'physiology', 'physiopathology', 'poisoning', 'prevention & control', 'psychology',
'radiation effects', 'radiotherapy', 'rehabilitation', 'secondary', 'secretion', 'standards',
'statistics & numerical data', 'supply & distribution', 'surgery', 'therapeutic use', 'therapy', 'toxicity',
'transmission', 'trends', 'ultrastructure', 'urine', 'utilization', 'veterinary', 'virology')
qualifiers_by_article_list = list()
for(article in names(mesh_terms_by_article_list)){
mesh_terms = mesh_terms_by_article_list[[article]]
article_qualifiers = c()
if(!is.na(mesh_terms[1])){
for(mesh_term in mesh_terms){
# Look for qualifiers
match_general = sapply(qualifiers, grepl, mesh_term)
match_agonists = grepl('agonists', mesh_term) & !grepl('ntagonists', mesh_term)
match_blood = grepl('blood', mesh_term) & !grepl('blood supply', mesh_term)
match_chemistry = grepl('chemistry', mesh_term) & !grepl('immunohistochemistry', mesh_term)
match_genetics = grepl('genetics', mesh_term) & !grepl('Xgenetics', mesh_term)
# Add matches to the qualifiers list
if(match_agonists) article_qualifiers = c(article_qualifiers, 'agonists')
if(match_blood) article_qualifiers = c(article_qualifiers, 'blood')
if(match_chemistry) article_qualifiers = c(article_qualifiers, 'chemistry')
if(match_genetics) article_qualifiers = c(article_qualifiers, 'genetics')
if(sum(match_general)>0) article_qualifiers = c(article_qualifiers, names(match_general)[match_general==TRUE])
}
# Update article entries
qualifiers_by_article_list[[article]] = unique(article_qualifiers)
}
else qualifiers_by_article_list[[article]] = NA
}
return(qualifiers_by_article_list)
}
create_article_qualifier_mat = function(qualifiers_by_article_list){
# GET LIST OF ARTICLES
all_articles = names(qualifiers_by_article_list)
# GET LIST OF QUALIFIERS
all_qualifiers = c()
for(mts in qualifiers_by_article_list) all_qualifiers = unique(c(all_qualifiers, mts))
# CREATE ARTICLE-QUALIFIER MATRIX
article_qualifier_mat = data.frame(matrix(0, nrow=length(all_articles), ncol=sum(!is.na(all_qualifiers))))
rownames(article_qualifier_mat) = all_articles
colnames(article_qualifier_mat) = all_qualifiers[!is.na(all_qualifiers)]
for(article in all_articles){
article_mesh_terms = qualifiers_by_article_list[[article]]
if(sum(is.na(article_mesh_terms))==0){
article_qualifier_mat[article, article_mesh_terms] = 1
}
}
article_qualifier_mat = Matrix(as.matrix(article_qualifier_mat), sparse=TRUE)
return(article_qualifier_mat)
}
mesh_terms_by_article_list = read_json('./../Data/MeSH_terms_by_paper.json', simplifyVector=TRUE)
qualifiers_by_article_list = get_qualifiers(mesh_terms_by_article_list)
article_qualifier_mat = create_article_qualifier_mat(qualifiers_by_article_list)
print(paste0('Articles: ', nrow(article_qualifier_mat), ' Qualifiers: ', ncol(article_qualifier_mat)))
## [1] "Articles: 3707 Qualifiers: 65"
gene_pmID_mat = gene_pmID_mat[,colnames(gene_pmID_mat) %in% rownames(article_qualifier_mat)]
# Check that ordering is the same in both matrices
if(!all(colnames(gene_pmID_mat) == rownames(article_qualifier_mat))){
print("Article ordering doesn't match")
}
gene_qualifier_mat = gene_pmID_mat %*% article_qualifier_mat
print(paste0('Genes: ', nrow(gene_qualifier_mat), ' Qualifiers: ', ncol(gene_qualifier_mat)))
## [1] "Genes: 1053 Qualifiers: 65"
to_keep = rowSums(gene_qualifier_mat)>0
print(paste0('Removing ', sum(!to_keep), ' genes'))
## [1] "Removing 14 genes"
SFARI score distribution of removed genes:
SFARI_genes_info$gene.score[!SFARI_genes_info$ID %in% rownames(gene_qualifier_mat)[to_keep]] %>% table(useNA='ifany')
## .
## 3 4 5
## 3 6 5
# Remove genes
gene_qualifier_mat = gene_qualifier_mat[to_keep,]
rm(to_keep)
genes_by_qualifier = data.frame('ID'=colnames(gene_qualifier_mat), 'n_genes'=colSums(gene_qualifier_mat))
ggplotly(genes_by_qualifier %>% ggplot(aes(reorder(ID, n_genes), n_genes, fill=sqrt(n_genes))) + geom_bar(stat='identity') +
ggtitle('No. Genes related to each Qualifier') + coord_flip() + scale_y_sqrt() +
xlab('Qualifier') + ylab('No. Genes') + theme_minimal() + theme(legend.position = 'none'))
rm(genes_by_qualifier)
The higher the SFARI score, the more qualifiers it contains (makes sense because there was a relation between SFARI score and number of publications). The same relation exists between syndromic and non-syndromic genes.
qualifiers_by_gene = data.frame('ID'=rownames(gene_qualifier_mat), 'n_qualifiers'=rowSums(gene_qualifier_mat)) %>%
left_join(SFARI_genes_info, by='ID') %>% dplyr::select(ID, n_qualifiers, gene.score, syndromic) %>%
mutate(gene.score = as.factor(gene.score), syndromic=as.factor(as.logical(syndromic)))
p1 = ggplotly(qualifiers_by_gene %>% ggplot(aes(gene.score, n_qualifiers, fill=gene.score)) + geom_boxplot() +
xlab('Gene Score') + ylab('No. Qualifiers') + theme_minimal() + theme(legend.position='none') +
scale_fill_manual(values=SFARI_colour_hue()) + scale_color_manual(values=SFARI_colour_hue()))
p2 = ggplotly(qualifiers_by_gene %>% ggplot(aes(syndromic, n_qualifiers, fill=syndromic)) + geom_boxplot() +
ggtitle('Qualifiers related to each gene by score (left) and syndromic tag (right)') +
xlab('Syndromic') + ylab('No. Qualifiers') + theme_minimal() + theme(legend.position='none'))
subplot(p1, p2, nrows=1, widths=c(0.7,0.3))
rm(p1, p2)
Since scores 5 and 6 have no syndromic genes, this tag can be a confounder, but even when separating the genes both by score and syndromic tag, the relation persists
ggplotly(qualifiers_by_gene %>% ggplot(aes(gene.score, n_qualifiers, fill=gene.score)) + geom_boxplot() +
ggtitle('Qualifiers related to each gene by score and syndromic tag') +
xlab('Gene Score') + ylab('No. Qualifiers') + facet_wrap(~syndromic) +
scale_fill_manual(values=SFARI_colour_hue()) + scale_color_manual(values=SFARI_colour_hue()) +
theme_minimal() + theme(legend.position='none'))
rm(qualifiers_by_gene)
On average, each node is connected to half of the nodes -> strongly connected Network
# EDGES
gene_gene_mat = gene_qualifier_mat %*% t(gene_qualifier_mat)
gene_names = rownames(gene_qualifier_mat)
edges = summary(gene_gene_mat) %>% rename('from'=i, 'to'=j, 'count'=x) %>%
mutate(from = gene_names[from], to = gene_names[to]) %>%
filter(from!=to)
# Convert count to fraction (multiplying by 100 to make the numbers bigger)
gene_qualifier_mat_rowsums = rowSums(gene_qualifier_mat)
names(gene_qualifier_mat_rowsums) = rownames(gene_gene_mat)
edges = edges %>% mutate('weight'=count/(gene_qualifier_mat_rowsums[from]+gene_qualifier_mat_rowsums[to]))
# NODES
nodes = SFARI_genes_info %>% dplyr::select(gene.symbol, gene.score, syndromic, number.of.reports) %>%
mutate(id = gene.symbol, title=gene.symbol, value=number.of.reports,
shape=ifelse(syndromic==0, 'circle','square'),
color_score=case_when(gene.score==1~'#FF7631', gene.score==2~'#FFB100',
gene.score==3~'#E8E328', gene.score==4~'#8CC83F',
gene.score==5~'#62CCA6', gene.score==6~'#59B9C9',
is.na(gene.score) ~ '#b3b3b3')) %>%
mutate('color'=color_score) %>% filter(!duplicated(id)) %>%
filter(gene.symbol %in% unique(c(edges$from, edges$to)))
# Details:
print(paste0('Nodes: ', nrow(nodes),' Edges: ', nrow(edges)/2))
## [1] "Nodes: 1039 Edges: 537956"
The higher the SFARI score, the higher the eigencentrality of the genes in the Network. Same with syndromic tag
graph = graph_from_data_frame(edges, vertices=nodes, directed=FALSE)
graph = graph %>% simplify(edge.attr.comb='first')
eigen_centrality_output = eigen_centrality(graph)
eigencentr = as.numeric(eigen_centrality_output$vector)
graph = graph %>% set_vertex_attr('eigencentrality', value=eigencentr)
plot_data = data.frame('gene.score'= as.factor(vertex_attr(graph, 'gene.score')),
'Syndromic'= as.logical(vertex_attr(graph, 'syndromic')),
'Eigencentrality'= vertex_attr(graph, 'eigencentrality'))
correlation_scr = cor(nodes$gene.score[!is.na(nodes$gene.score)], eigencentr[!is.na(nodes$gene.score)])
correlation_syn = cor(as.numeric(plot_data$Syndromic), plot_data$Eigencentrality)
p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, Eigencentrality, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue()) + theme_minimal() + theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(Syndromic, Eigencentrality, fill=Syndromic)) + geom_boxplot() +
theme_minimal() + theme(legend.position='none') +
ggtitle(paste0('Correlation = ', round(correlation_scr, 2), ' (left) and ', round(correlation_syn,2), ' (right)')))
subplot(p1, p2, nrows=1, widths=c(0.7, 0.3))
rm(correlation_scr, correlation_syn, p1, p2)
When separating the SFARI scores by syndromic tag, the relation between eigencentrality and score persists
corr_non_syn = cor(nodes$gene.score[!is.na(nodes$gene.score) & nodes$syndromic==0],
eigencentr[!is.na(nodes$gene.score) & nodes$syndromic==0])
corr_syn = cor(nodes$gene.score[!is.na(nodes$gene.score) & nodes$syndromic==1],
eigencentr[!is.na(nodes$gene.score) & nodes$syndromic==1])
ggplotly(plot_data %>% ggplot(aes(gene.score, Eigencentrality, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue()) + theme_minimal() + theme(legend.position='none') +
facet_wrap( ~ Syndromic, ncol=2) + ggtitle(paste0('Correlation = ', round(corr_non_syn, 4),
' (left) and ', round(corr_syn, 4), ' (right)')))
rm(plot_data, corr_non_syn, corr_syn, nodes, edges)
plot_data = data.frame('id' = graph %>% get.vertex.attribute('name'), 'eigencentrality' = eigencentr) %>%
top_n(25, wt=eigencentrality) %>% left_join(SFARI_genes_info, by=c('id'='ID'))
## Warning: Column `id`/`ID` joining factors with different levels, coercing
## to character vector
ggplotly(plot_data %>% ggplot(aes(reorder(id, eigencentrality), eigencentrality, fill=as.factor(gene.score))) +
geom_bar(stat='identity') + scale_fill_manual(values=SFARI_colour_hue(), na.value='#b3b3b3') +
coord_flip() + ggtitle(paste0('Top 25 genes with the highest Network centrality')) +
xlab('') + ylab('Eigencentrality') + theme_minimal() + theme(legend.position='none'))
rm(plot_data)
Note: Network is too densely connected to plot (too heavy and not very informative)
comms_louvain = cluster_louvain(graph)
modules = comms_louvain %>% membership
color_pal = viridis_pal()(max(modules)) %>% substr(1,7)
graph = graph %>% set_vertex_attr('module', value=as.numeric(modules)) %>%
set_vertex_attr('color_modules', value=color_pal[as.numeric(modules)]) %>%
set_vertex_attr('color', value=color_pal[as.numeric(modules)])
comm_sizes = sizes(comms_louvain) %>% data.frame %>% rename(Module=Community.sizes, size=Freq)
ggplotly(comm_sizes %>% ggplot(aes(Module, size, fill=size)) + geom_bar(stat='identity') +
ggtitle('Number of genes in each module') + theme_minimal() + theme(legend.position = 'none'))
hm_data = table(vertex_attr(graph, 'gene.score'), vertex_attr(graph, 'module'), useNA='ifany') %>%
as.data.frame.matrix %>% as.matrix
heatmap.2(hm_data, cellnote=hm_data, notecol='white', Rowv=F, Colv=F, scale='row', trace='none',
key=FALSE, xlab='Module', ylab='SFARI score', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))
hm_data = table(vertex_attr(graph, 'syndromic'), vertex_attr(graph, 'module'), useNA='ifany') %>%
as.data.frame.matrix %>% as.matrix
heatmap.2(hm_data, cellnote=hm_data, notecol='white', Rowv=F, Colv=F, scale='row', trace='none',
key=FALSE, xlab='Module', ylab='Syndromic', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))
Separating by syndromic tag the SFARI scores
hm_data = data.frame('syndromic_gene.score'=paste(vertex_attr(graph, 'syndromic'), '_',vertex_attr(graph, 'gene.score')),
'module'=vertex_attr(graph, 'module'))
hm_data = table(hm_data$syndromic_gene.score, hm_data$module) %>% as.data.frame.matrix %>% as.matrix
heatmap.2(hm_data, cellnote=hm_data, notecol='white', scale='row', trace='none',
key=FALSE, xlab='Module', ylab='Syndromic tag + SFARI Score', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))
rm(color_pal, hm_data)
Most important genes for each module coloured by SFARI score
module_info = tibble('module'=character(), 'top_term'=character(), size=integer())
for(i in names(sizes(comms_louvain))){
module_vertices = names(modules)[modules==i]
# Creat subgraph
module_graph = induced_subgraph(graph, module_vertices)
# Calculate eigencentrality
eigen_centrality_output = eigen_centrality(module_graph)
eigencentr = as.numeric(eigen_centrality_output$vector)
names(eigencentr) = module_vertices
module_info = module_info %>% add_row('module'=i, 'top_term'=names(eigencentr)[eigencentr==max(eigencentr)][1],
'size'=length(module_vertices))
# Plot eigencentrality of top 10 genes
plot_data = data.frame('gene'=names(eigencentr), 'eigencentrality'=eigencentr) %>%
top_n(10, wt=eigencentrality) %>% left_join(SFARI_genes_info, by=c('gene'='ID'))
print(plot_data %>% ggplot(aes(reorder(gene, eigencentrality), eigencentrality, fill=as.factor(gene.score))) +
geom_bar(stat='identity') + scale_fill_manual(values=SFARI_colour_hue(), na.value='#b3b3b3') + coord_flip() +
ggtitle(paste0('Top genes for Module ', i)) + xlab('Genes') + ylab('Eigencentrality') +
theme_minimal() + theme(legend.position='none'))
}
rm(i, module_vertices, module_graph, eigen_centrality_output, eigencentr, plot_data)
Most important Qualifiers for each module
for(i in names(sizes(comms_louvain))){
module_genes = names(modules)[modules==i]
# Filter genes belonging to module and column with non-zero values
module_gene_qualifier_mat = gene_qualifier_mat[rownames(gene_qualifier_mat) %in% module_genes,]
module_gene_qualifier_mat = module_gene_qualifier_mat[,colSums(module_gene_qualifier_mat)>0]
# Do PCA and extract 1st PC
pca_module = prcomp(module_gene_qualifier_mat, scale.=TRUE)
module_eigengene = pca_module$rotation[,'PC1']
names(module_eigengene) = colnames(module_gene_qualifier_mat)
# Plot module's most relevant qualifiers
plot_data = data.frame('qualifier'=names(module_eigengene), 'eigencentrality'=module_eigengene) %>%
top_n(25, wt=abs(eigencentrality))
print(plot_data %>% ggplot(aes(reorder(qualifier, abs(eigencentrality)), eigencentrality, fill=abs(eigencentrality))) +
geom_bar(position='identity', stat='identity') + scale_fill_viridis_c() + coord_flip() +
ggtitle(paste0('Top Qualifiers for Module ', i)) +
xlab('Qualifiers') + ylab('Eigencentrality') + theme_minimal() + theme(legend.position='none'))
}
rm(i, module_genes, module_gene_qualifier_mat, pca_module, module_eigengene, plot_data)